library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Record the exact R version, platform, locale, and package versions
# used for this analysis, so the results can be reproduced later.
# The `##` lines are console output captured by knitr.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.2.0 stringr_1.2.0 dplyr_0.7.4 purrr_0.2.4
## [5] readr_1.1.1 tidyr_0.8.0 tibble_1.4.2 ggplot2_2.2.1
## [9] tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.15 cellranger_1.1.0 pillar_1.1.0 compiler_3.4.3
## [5] plyr_1.8.4 bindr_0.1 tools_3.4.3 digest_0.6.15
## [9] lubridate_1.7.1 jsonlite_1.5 evaluate_0.10.1 nlme_3.1-131
## [13] gtable_0.2.0 lattice_0.20-35 pkgconfig_2.0.1 rlang_0.1.6
## [17] psych_1.7.8 cli_1.0.0 rstudioapi_0.7 yaml_2.1.16
## [21] parallel_3.4.3 haven_1.1.1 bindrcpp_0.2 xml2_1.2.0
## [25] httr_1.3.1 knitr_1.19 hms_0.4.1 rprojroot_1.3-2
## [29] grid_3.4.3 glue_1.2.0 R6_2.2.2 readxl_1.0.0
## [33] foreign_0.8-69 rmarkdown_1.8 modelr_0.1.1 reshape2_1.4.3
## [37] magrittr_1.5 backports_1.1.2 scales_0.5.0 htmltools_0.3.6
## [41] rvest_0.3.2 assertthat_0.2.0 mnormt_1.5-5 colorspace_1.3-2
## [45] stringi_1.1.6 lazyeval_0.2.1 munsell_0.4.3 broom_0.4.3
## [49] crayon_1.3.4
# A deliberately awkward (but syntactically legal) object name: dots,
# underscores, digits and mixed case all parse, even though the style
# guide discourages every one of them.
My_data.name___is.too0ooLong <- c(1, 2, 3)
My_data.name___is.too0ooLong
## [1] 1 2 3

# A plain character vector.
y_string <- c("has", "an", "error", "in", "it")
y_string
## [1] "has" "an" "error" "in" "it"

# Mixing numbers and strings inside c(): every element is coerced to
# the most general common type, so the whole vector becomes character.
my_vector <- c(1, 2, "3", "4", 5)
my_vector
## [1] "1" "2" "3" "4" "5"
typeof(my_vector)
## [1] "character"
# Read the pipe-delimited rail-trail data set; readr reports the column
# types it guessed.
file_path <- "~/compscix-415-2-assignments/rail_trail.txt"
rail_trail <- read_delim(file = file_path, delim = "|")
## Parsed with column specification:
## cols(
## hightemp = col_integer(),
## lowtemp = col_integer(),
## avgtemp = col_double(),
## spring = col_integer(),
## summer = col_integer(),
## fall = col_integer(),
## cloudcover = col_double(),
## precip = col_double(),
## volume = col_integer(),
## weekday = col_integer()
## )
glimpse(rail_trail)
## Observations: 90
## Variables: 10
## $ hightemp <int> 83, 73, 74, 95, 44, 69, 66, 66, 80, 79, 78, 65, 41,...
## $ lowtemp <int> 50, 49, 52, 61, 52, 54, 39, 38, 55, 45, 55, 48, 49,...
## $ avgtemp <dbl> 66.5, 61.0, 63.0, 78.0, 48.0, 61.5, 52.5, 52.0, 67....
## $ spring <int> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...
## $ summer <int> 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...
## $ fall <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...
## $ cloudcover <dbl> 7.6, 6.3, 7.5, 2.6, 10.0, 6.6, 2.4, 0.0, 3.8, 4.1, ...
## $ precip <dbl> 0.00, 0.29, 0.32, 0.00, 0.14, 0.02, 0.00, 0.00, 0.0...
## $ volume <int> 501, 419, 397, 385, 200, 375, 417, 629, 533, 547, 4...
## $ weekday <int> 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...

# Round-trip the tibble through an .rds file and confirm the structure
# survives unchanged.
saveRDS(rail_trail, file = "~/compscix-415-2-assignments/rail_trail.rds")
rail_trail <- readRDS(file = "~/compscix-415-2-assignments/rail_trail.rds")
glimpse(rail_trail)
## Observations: 90
## Variables: 10
## $ hightemp <int> 83, 73, 74, 95, 44, 69, 66, 66, 80, 79, 78, 65, 41,...
## $ lowtemp <int> 50, 49, 52, 61, 52, 54, 39, 38, 55, 45, 55, 48, 49,...
## $ avgtemp <dbl> 66.5, 61.0, 63.0, 78.0, 48.0, 61.5, 52.5, 52.0, 67....
## $ spring <int> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, ...
## $ summer <int> 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, ...
## $ fall <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...
## $ cloudcover <dbl> 7.6, 6.3, 7.5, 2.6, 10.0, 6.6, 2.4, 0.0, 3.8, 4.1, ...
## $ precip <dbl> 0.00, 0.29, 0.32, 0.00, 0.14, 0.02, 0.00, 0.00, 0.0...
## $ volume <int> 501, 419, 397, 385, 200, 375, 417, 629, 533, 547, 4...
## $ weekday <int> 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
library(forcats)

# Survey results: percentage of respondents who agree a woman will be
# president in their lifetime, by age group and by gender.
# Fixed: the original tribble() calls had a trailing comma before the
# closing parenthesis — a dangling empty argument that some tibble
# versions reject with an error.
president <- tribble(
  ~age,      ~Agree, ~value,
  "under45", "yes",  79,
  "under45", "no",   16,
  "b45-64",  "yes",  69,
  "b45-64",  "no",   22,
  "65over",  "yes",  44,
  "65over",  "no",   39
)

president2 <- tribble(
  ~gender, ~Agree, ~value,
  "Men",   "yes",  65,
  "Men",   "no",   25,
  "Women", "yes",  72,
  "Women", "no",   20
)

# Put the age groups in their natural order instead of alphabetical.
president <- president %>%
  mutate(age = fct_relevel(age, "under45", "b45-64", "65over"))
president
## # A tibble: 6 x 3
## age Agree value
## <fct> <chr> <dbl>
## 1 under45 yes 79.0
## 2 under45 no 16.0
## 3 b45-64 yes 69.0
## 4 b45-64 no 22.0
## 5 65over yes 44.0
## 6 65over no 39.0

# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
# (available since ggplot2 2.2.0; this session runs 2.2.1).
ggplot(data = president) +
  geom_col(mapping = aes(x = age, y = value, fill = Agree), position = "dodge") +
  labs(x = "Age group", y = "Percentage", title = "% respondents who agree a woman will be president in their lifetime")
ggplot(data = president2) +
  geom_col(mapping = aes(x = gender, y = value, fill = Agree), position = "dodge") +
  labs(x = "gender", y = "Percentage", title = "% respondents who agree a woman will be president in their lifetime")
library(tidyverse)

# Carat distribution by cut, colored by diamond color. With
# position = "identity" the per-color boxes are overlaid rather than
# dodged side by side.
ggplot(data = diamonds, mapping = aes(x = cut, y = carat, fill = color)) +
  geom_boxplot(position = "identity") +
  coord_flip() +
  xlab("CUT OF DIAMOND") +
  ylab("CARAT OF DIAMOND")

# Same plot, with the cut categories reordered by their median carat.
ggplot(data = diamonds,
       mapping = aes(x = reorder(cut, carat, FUN = median), y = carat,
                     fill = color)) +
  geom_boxplot(position = "identity") +
  coord_flip() +
  xlab("CUT OF DIAMOND") +
  ylab("CARAT OF DIAMOND")

# And reordered by reorder()'s default summary (the mean of carat).
ggplot(data = diamonds,
       mapping = aes(x = reorder(cut, carat), y = carat, fill = color)) +
  geom_boxplot(position = "identity") +
  coord_flip() +
  xlab("CUT OF DIAMOND") +
  ylab("CARAT OF DIAMOND")
# table2 stores cases and population as separate rows ("long" form).
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583

# Widen so each country/year pair becomes one row with separate cases
# and population columns. (spread() is superseded by pivot_wider() in
# tidyr >= 1.0; this session runs tidyr 0.8.0, so spread() it is.)
table2 %>%
  spread(key = type, value = count)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
# Texas housing data: monthly sales records per city.
glimpse(txhousing)
## Observations: 8,602
## Variables: 9
## $ city <chr> "Abilene", "Abilene", "Abilene", "Abilene", "Abilene...
## $ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000...
## $ month <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5...
## $ sales <dbl> 72, 98, 130, 98, 141, 156, 152, 131, 104, 101, 100, ...
## $ volume <dbl> 5380000, 6505000, 9285000, 9730000, 10590000, 139100...
## $ median <dbl> 71400, 58700, 58100, 68600, 67300, 66900, 73500, 750...
## $ listings <dbl> 701, 746, 784, 785, 794, 780, 742, 765, 771, 764, 72...
## $ inventory <dbl> 6.3, 6.6, 6.8, 6.9, 6.8, 6.6, 6.2, 6.4, 6.5, 6.6, 6....
## $ date <dbl> 2000.000, 2000.083, 2000.167, 2000.250, 2000.333, 20...

# Observations per city, written out the long way...
txhousing %>%
  group_by(city) %>%
  summarise(count = n())
## # A tibble: 46 x 2
## city count
## <chr> <int>
## 1 Abilene 187
## 2 Amarillo 187
## 3 Arlington 187
## 4 Austin 187
## 5 Bay Area 187
## 6 Beaumont 187
## 7 Brazoria County 187
## 8 Brownsville 187
## 9 Bryan-College Station 187
## 10 Collin County 187
## # ... with 36 more rows

# ...and the shortcut: count() wraps group_by() + summarise(n()).
txhousing %>%
  count(city)
## # A tibble: 46 x 2
## city n
## <chr> <int>
## 1 Abilene 187
## 2 Amarillo 187
## 3 Arlington 187
## 4 Austin 187
## 5 Bay Area 187
## 6 Beaumont 187
## 7 Brazoria County 187
## 8 Brownsville 187
## 9 Bryan-College Station 187
## 10 Collin County 187
## # ... with 36 more rows
# Mean sales per city/month/year, highest first.
# Fixed: summarise() only peels off the last grouping level, so the
# original result was still grouped by city and month (the printed
# "# Groups: city, month [552]" line). ungroup() returns a plain
# tibble so downstream verbs don't silently operate per-group.
txhousing %>%
  group_by(city, month, year) %>%
  summarise(sales = mean(sales, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(sales))
## # A tibble: 8,602 x 4
## city month year sales
## <chr> <int> <int> <dbl>
## 1 Houston 7 2015 8945
## 2 Houston 6 2006 8628
## 3 Houston 7 2013 8468
## 4 Houston 6 2015 8449
## 5 Houston 5 2013 8439
## 6 Houston 6 2014 8391
## 7 Houston 7 2014 8391
## 8 Houston 8 2014 8167
## 9 Houston 8 2013 8155
## 10 Houston 5 2006 8040
## # ... with 8,592 more rows
# Listings vs sales with a smoother. The shared mapping is set once in
# ggplot() and inherited by both layers. Rows with missing values are
# dropped with a warning.
ggplot(data = txhousing, mapping = aes(x = listings, y = sales)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1426 rows containing non-finite values (stat_smooth).
## Warning: Removed 1426 rows containing missing values (geom_point).

# na.rm = FALSE is the default, so this produces the same plot and the
# same warnings as above.
ggplot(data = txhousing, mapping = aes(x = listings, y = sales)) +
  geom_point(na.rm = FALSE) +
  geom_smooth(na.rm = FALSE)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1426 rows containing non-finite values (stat_smooth).
## Warning: Removed 1426 rows containing missing values (geom_point).

# na.rm = TRUE silences the missing-value warnings.
ggplot(data = txhousing, mapping = aes(x = listings, y = sales)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(na.rm = TRUE)
## `geom_smooth()` using method = 'gam'

# One scatter panel per year.
ggplot(data = txhousing, mapping = aes(x = listings, y = sales)) +
  geom_point() +
  facet_wrap(~ year, nrow = 3)
## Warning: Removed 1426 rows containing missing values (geom_point).

# One scatter panel per city.
ggplot(data = txhousing, mapping = aes(x = listings, y = sales)) +
  geom_point() +
  facet_wrap(~ city)
## Warning: Removed 1426 rows containing missing values (geom_point).
# Check your assumption and show your work. ## 5) What proportion of sales is missing for each city?
# Rows where sales is missing.
sales_missing <- txhousing %>%
  filter(is.na(sales))

# Total observations per city (every city has 187 months of data).
txhousing %>% group_by(city) %>% summarize(sales_city = n())
## # A tibble: 46 x 2
## city sales_city
## <chr> <int>
## 1 Abilene 187
## 2 Amarillo 187
## 3 Arlington 187
## 4 Austin 187
## 5 Bay Area 187
## 6 Beaumont 187
## 7 Brazoria County 187
## 8 Brownsville 187
## 9 Bryan-College Station 187
## 10 Collin County 187
## # ... with 36 more rows

# Count of missing-sales rows per city (only cities with at least one
# NA appear here).
sales_missing %>% group_by(city) %>% summarize(missing_count = n())
## # A tibble: 20 x 2
## city missing_count
## <chr> <int>
## 1 Brazoria County 14
## 2 Brownsville 2
## 3 Corpus Christi 1
## 4 Galveston 1
## 5 Harlingen 25
## 6 Kerrville 104
## 7 Killeen-Fort Hood 1
## 8 Laredo 36
## 9 Longview-Marshall 12
## 10 Lubbock 1
## 11 McAllen 2
## 12 Midland 75
## 13 Nacogdoches 11
## 14 Odessa 72
## 15 Port Arthur 2
## 16 San Marcos 46
## 17 Texarkana 17
## 18 Temple-Belton 11
## 19 South Padre Island 116
## 20 Waco 19

# The question asks for the *proportion* missing per city; the original
# stopped at two separate counts. mean() of a logical vector is the
# fraction of TRUEs, so this answers it directly in one step.
txhousing %>%
  group_by(city) %>%
  summarize(prop_missing = mean(is.na(sales)))
# City-months with more than 500 sales. Fixed: the original defined
# this identical filter twice back-to-back; once is enough.
sales_m500 <- txhousing %>%
  filter(sales > 500)

# Distribution of median price by city among high-volume months.
# NOTE(review): ggplot2 2.2.1 expects the discrete variable on x for
# geom_boxplot; with city on y here the orientation looks inverted —
# confirm this plot renders as intended.
ggplot(data = sales_m500) +
  geom_boxplot(mapping = aes(x = median, y = city), position = "identity")

# Mean of the median price per city, drawn as a horizontal bar chart.
txhousing %>%
  filter(sales > 500) %>%
  group_by(city) %>%
  summarise(prop_median = mean(median, na.rm = TRUE)) %>%
  ggplot(aes(y = prop_median, x = city)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Sales vs median price, one panel per city, with a loess smoother.
# Fixed: removed a redundant re-definition of sales_m500 — it repeated,
# byte for byte, the filter(sales > 500) already assigned above.
# The loess warnings below come from small cities with too few
# high-volume months for the default span.
ggplot(data = sales_m500) +
  geom_point(mapping = aes(x = sales, y = median)) +
  geom_smooth(mapping = aes(x = sales, y = median)) +
  facet_wrap(~ city)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 505.7
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 9.295
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2529.6
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 5
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 5
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small.
## fewer data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 505.7
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 9.295
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2529.6
## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced
# Same faceted scatter, without the smoother. Fixed: removed yet
# another redundant, identical re-definition of sales_m500.
ggplot(data = sales_m500) +
  geom_point(mapping = aes(x = sales, y = median)) +
  facet_wrap(~ city)